{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data cleaning\n",
    "After gathering all the data (see the report and Data_wrangling.ipynb) it is time to clean the data.\n",
    "The cleaning phase follow these steps:\n",
    "- Load all the collection txt files into a labeled pandas dataframe\n",
    "- Remove empty data points, and remove malicious data points of length 1\n",
    "- Remove any duplicates\n",
    "- Shuffle the dataset to remove any ordering bias\n",
    "- Store into a .csv file for visualization and also store the dataframe into a .pickle file to easily allow further computations on the data set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step1\n",
    "Import dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import csv\n",
    "from IPython.display import display"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step2\n",
    "Compute a pandas dataframe with the payloads from the different collections  \n",
    "dataframe columns:  \n",
    "$<is\\_malicious> | <injection\\_type> | <payload>$  \n",
    "example: \n",
    "$1 | SQL | ' OR 1=1 LIMIT 1 #$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First 5 lines of SQL\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>payload</th>\n",
       "      <th>is_malicious</th>\n",
       "      <th>injection_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>﻿'\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SQL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a' or 1=1-- \\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SQL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\"a\"\" or 1=1--\"\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SQL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>or a = a\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SQL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>a' or 'a' = 'a\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SQL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            payload  is_malicious injection_type\n",
       "0              ﻿'\\n             1            SQL\n",
       "1    a' or 1=1-- \\n             1            SQL\n",
       "2  \"a\"\" or 1=1--\"\\n             1            SQL\n",
       "3        or a = a\\n             1            SQL\n",
       "4  a' or 'a' = 'a\\n             1            SQL"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First 5 lines of XSS\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>payload</th>\n",
       "      <th>is_malicious</th>\n",
       "      <th>injection_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>﻿data:text/html;alert(1)/*,&lt;svg%20onload=eval(...</td>\n",
       "      <td>1</td>\n",
       "      <td>XSS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>'\"&gt;*/--&gt;&lt;/title&gt;&lt;/style&gt;&lt;/textarea&gt;&lt;/script%0A...</td>\n",
       "      <td>1</td>\n",
       "      <td>XSS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\" onclick=alert(1)//&lt;button ‘ onclick=alert(1)...</td>\n",
       "      <td>1</td>\n",
       "      <td>XSS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>';alert(String.fromCharCode(88,83,83))//';aler...</td>\n",
       "      <td>1</td>\n",
       "      <td>XSS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\"&gt;&gt;&lt;marquee&gt;&lt;img src=x onerror=confirm(1)&gt;&lt;/ma...</td>\n",
       "      <td>1</td>\n",
       "      <td>XSS</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             payload  is_malicious  \\\n",
       "0  ﻿data:text/html;alert(1)/*,<svg%20onload=eval(...             1   \n",
       "1  '\">*/--></title></style></textarea></script%0A...             1   \n",
       "2  \" onclick=alert(1)//<button ‘ onclick=alert(1)...             1   \n",
       "3  ';alert(String.fromCharCode(88,83,83))//';aler...             1   \n",
       "4  \">><marquee><img src=x onerror=confirm(1)></ma...             1   \n",
       "\n",
       "  injection_type  \n",
       "0            XSS  \n",
       "1            XSS  \n",
       "2            XSS  \n",
       "3            XSS  \n",
       "4            XSS  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First 5 lines of SHELL\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>payload</th>\n",
       "      <th>is_malicious</th>\n",
       "      <th>injection_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>() { 0; }; touch /tmp/blns.shellshock1.fail;\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SHELL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>() { _; } &gt;_[$($())] { touch /tmp/blns.shellsh...</td>\n",
       "      <td>1</td>\n",
       "      <td>SHELL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;&lt;&lt; %s(un='%s') = %u\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SHELL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>'+++ATH0\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SHELL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/dev/null; touch /tmp/blns.fail ; echo\\n</td>\n",
       "      <td>1</td>\n",
       "      <td>SHELL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             payload  is_malicious  \\\n",
       "0     () { 0; }; touch /tmp/blns.shellshock1.fail;\\n             1   \n",
       "1  () { _; } >_[$($())] { touch /tmp/blns.shellsh...             1   \n",
       "2                             <<< %s(un='%s') = %u\\n             1   \n",
       "3                                         '+++ATH0\\n             1   \n",
       "4           /dev/null; touch /tmp/blns.fail ; echo\\n             1   \n",
       "\n",
       "  injection_type  \n",
       "0          SHELL  \n",
       "1          SHELL  \n",
       "2          SHELL  \n",
       "3          SHELL  \n",
       "4          SHELL  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First 5 lines of LEGAL\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>payload</th>\n",
       "      <th>is_malicious</th>\n",
       "      <th>injection_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>569993989\\n</td>\n",
       "      <td>0</td>\n",
       "      <td>LEGAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>46201\\n</td>\n",
       "      <td>0</td>\n",
       "      <td>LEGAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Indianapolis\\n</td>\n",
       "      <td>0</td>\n",
       "      <td>LEGAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>20354328\\n</td>\n",
       "      <td>0</td>\n",
       "      <td>LEGAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A8Cyj4uzrSgkGg4szKuHeI\\n</td>\n",
       "      <td>0</td>\n",
       "      <td>LEGAL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    payload  is_malicious injection_type\n",
       "0               569993989\\n             0          LEGAL\n",
       "1                   46201\\n             0          LEGAL\n",
       "2            Indianapolis\\n             0          LEGAL\n",
       "3                20354328\\n             0          LEGAL\n",
       "4  A8Cyj4uzrSgkGg4szKuHeI\\n             0          LEGAL"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def from_txt_to_dataframe(src_file,is_malicious,injection_type):\n",
    "    \n",
    "    #read file\n",
    "    payloads_txt = open('data/{}.txt'.format(src_file),'r',encoding='UTF-8').readlines()\n",
    "    \n",
    "    #create dataframe\n",
    "    payloads = pd.DataFrame(payloads_txt,columns=['payload'])\n",
    "    payloads['is_malicious'] = [is_malicious]*len(payloads)\n",
    "    payloads['injection_type'] = [injection_type]*len(payloads)\n",
    "\n",
    "    print('First 5 lines of ' + injection_type)\n",
    "    display(payloads.head())\n",
    "    \n",
    "    return payloads\n",
    "    \n",
    "#concatenate all payload dataframes together\n",
    "payloads = pd.DataFrame(columns=['payload','is_malicious','injection_type'])\n",
    "payloads = payloads.append(from_txt_to_dataframe('SQLCollection',1,'SQL'))\n",
    "payloads = payloads.append(from_txt_to_dataframe('XSSCollection',1,'XSS'))\n",
    "payloads = payloads.append(from_txt_to_dataframe('ShellCollection',1,'SHELL'))\n",
    "payloads = payloads.append(from_txt_to_dataframe('non-maliciousCollection',0,'LEGAL'))\n",
    "payloads = payloads.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step2\n",
    "clean the data by\n",
    "- removing ending '\\n'\n",
    "- removing duplicates\n",
    "- removing empty data points\n",
    "- removing malicious data points with length 1\n",
    "- shuffle the data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Empty data points removed: 1\n",
      "Malicious data points of size 1 removed: 3\n",
      "Duplicate data points removed: 26072\n",
      "null/NaN data points removed: 3\n"
     ]
    }
   ],
   "source": [
    "#Remove ending \\n and white spaces\n",
    "payloads['payload'] = payloads['payload'].str.strip('\\n')\n",
    "payloads['payload'] = payloads['payload'].str.strip()\n",
    "\n",
    "#Remove any empty data points\n",
    "rows_before = len(payloads['payload'])\n",
    "payloads = payloads[payloads['payload'].str.len() != 0]\n",
    "print('Empty data points removed: ' + str(rows_before - len(payloads)))\n",
    "\n",
    "#Remove any malicious data points of size 1\n",
    "rows_before = len(payloads['payload'])\n",
    "payloads = payloads[(payloads['is_malicious'] == 0) | ((payloads['is_malicious'] == 1) & (payloads['payload'].str.len() > 1))]\n",
    "print('Malicious data points of size 1 removed: ' + str(rows_before-len(payloads)))\n",
    "\n",
    "#Remove duplicates\n",
    "rows_before = len(payloads['payload'])\n",
    "payloads = payloads.drop_duplicates(subset='payload', keep='last')\n",
    "print('Duplicate data points removed: ' + str(rows_before-len(payloads)))\n",
    "\n",
    "#Reformat rows that have the format b'<payload>' into <payload>\n",
    "payloads['payload'] = [payload[2:-1] if payload.startswith(\"b'\") or payload.startswith('b\"') \n",
    "                        else payload for payload in payloads['payload']]\n",
    "\n",
    "#Shuffle dataset and reset indices again\n",
    "payloads = payloads.sample(frac=1).reset_index(drop=True)\n",
    "payloads.index.name = 'index'\n",
    "\n",
    "#Remove payloads that cant be saved into .csv using pandas, e.g. they will be null/NA/NaN\n",
    "payloads.to_csv('data/payloads.csv',encoding='UTF-8')\n",
    "#reload dataframe from saved .csv. The dataframe will contain a few null values\n",
    "payloads = pd.read_csv(\"data/payloads.csv\",index_col='index',encoding='UTF-8') \n",
    "rows_before = len(payloads['payload'])\n",
    "payloads = payloads[~payloads['payload'].isnull()]\n",
    "print('null/NaN data points removed: ' + str(rows_before-len(payloads)))\n",
    "\n",
    "#Lastly, save to .csv\n",
    "payloads.to_csv('data/payloads.csv',encoding='UTF-8')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
